Visit With Us is a company that is looking to expand their customer base within the Tourism Sector. They offer travel packages to customers and currently offer 5 types:
The previous purchase rate they have experienced is approximately 18%. During the last campaign they randomly contacted customers without targeting based on information about the customers. This time they are releasing a new product, a Wellness Tourism Package. This package is a travel package designed to help the customer kick-start a healthy lifestyle/support one's well-being.
"""Packages imported for the analysis"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 200)
sns.set_style("darkgrid")
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')
from beepy import beep
%matplotlib inline
# Modeling Packages
from sklearn import metrics
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
"""Import dataset and format into a basic dataframe"""
# Read the Tourism sheet and work from a copy so the original read is never mutated
raw_df = pd.read_excel("Tourism.xlsx", sheet_name='Tourism').copy()
"""Initial assessment of data type, organization, and Nans"""
# Count of Nans within each feature of the raw data
raw_df.isna().sum()
CustomerID 0 ProdTaken 0 Age 226 TypeofContact 25 CityTier 0 DurationOfPitch 251 Occupation 0 Gender 0 NumberOfPersonVisiting 0 NumberOfFollowups 45 ProductPitched 0 PreferredPropertyStar 26 MaritalStatus 0 NumberOfTrips 140 Passport 0 PitchSatisfactionScore 0 OwnCar 0 NumberOfChildrenVisiting 66 Designation 0 MonthlyIncome 233 dtype: int64
# Restrict the Nan counts to only the features that actually have Nans
nan_counts = raw_df.isna().sum()
nan_counts[nan_counts > 0]
Age 226 TypeofContact 25 DurationOfPitch 251 NumberOfFollowups 45 PreferredPropertyStar 26 NumberOfTrips 140 NumberOfChildrenVisiting 66 MonthlyIncome 233 dtype: int64
# Datatypes and non-null counts for every feature of the raw data
raw_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4888 entries, 0 to 4887 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 4888 non-null int64 1 ProdTaken 4888 non-null int64 2 Age 4662 non-null float64 3 TypeofContact 4863 non-null object 4 CityTier 4888 non-null int64 5 DurationOfPitch 4637 non-null float64 6 Occupation 4888 non-null object 7 Gender 4888 non-null object 8 NumberOfPersonVisiting 4888 non-null int64 9 NumberOfFollowups 4843 non-null float64 10 ProductPitched 4888 non-null object 11 PreferredPropertyStar 4862 non-null float64 12 MaritalStatus 4888 non-null object 13 NumberOfTrips 4748 non-null float64 14 Passport 4888 non-null int64 15 PitchSatisfactionScore 4888 non-null int64 16 OwnCar 4888 non-null int64 17 NumberOfChildrenVisiting 4822 non-null float64 18 Designation 4888 non-null object 19 MonthlyIncome 4655 non-null float64 dtypes: float64(7), int64(7), object(6) memory usage: 763.9+ KB
# Split column names by dtype so numerical and categorical features
# can be assessed separately below
numeric_part = raw_df.select_dtypes(include=np.number)
categorical_part = raw_df.select_dtypes(exclude=np.number)
num_cols = pd.Series(numeric_part.columns)
cat_cols = pd.Series(categorical_part.columns)
print(f"The numerical data is: \n{num_cols}")
print('*' * 50)
print(f"The categorical data is: \n{cat_cols}")
The numerical data is: 0 CustomerID 1 ProdTaken 2 Age 3 CityTier 4 DurationOfPitch 5 NumberOfPersonVisiting 6 NumberOfFollowups 7 PreferredPropertyStar 8 NumberOfTrips 9 Passport 10 PitchSatisfactionScore 11 OwnCar 12 NumberOfChildrenVisiting 13 MonthlyIncome dtype: object ************************************************** The categorical data is: 0 TypeofContact 1 Occupation 2 Gender 3 ProductPitched 4 MaritalStatus 5 Designation dtype: object
# Spot-check seven random rows of the raw data
raw_df.sample(7)
| CustomerID | ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1669 | 201669 | 0 | 38.0 | Self Enquiry | 1 | 35.0 | Large Business | Male | 2 | 3.0 | Deluxe | 3.0 | Married | 4.0 | 0 | 1 | 0 | 0.0 | Manager | 21261.0 |
| 3284 | 203284 | 0 | 36.0 | Self Enquiry | 1 | 7.0 | Small Business | Male | 3 | 5.0 | Basic | 3.0 | Divorced | 8.0 | 0 | 2 | 1 | 2.0 | Executive | 20936.0 |
| 3176 | 203176 | 1 | 31.0 | Company Invited | 1 | 12.0 | Small Business | Female | 3 | 4.0 | Basic | 4.0 | Single | 2.0 | 0 | 3 | 0 | 1.0 | Executive | 21100.0 |
| 3480 | 203480 | 1 | 32.0 | Company Invited | 1 | 9.0 | Salaried | Male | 4 | 4.0 | Basic | 3.0 | Divorced | 8.0 | 1 | 3 | 1 | 2.0 | Executive | 22400.0 |
| 1291 | 201291 | 1 | NaN | Self Enquiry | 1 | 16.0 | Small Business | Male | 2 | 3.0 | Deluxe | 5.0 | Single | 2.0 | 0 | 4 | 0 | 0.0 | Manager | NaN |
| 876 | 200876 | 0 | 20.0 | Self Enquiry | 3 | 27.0 | Small Business | Male | 2 | 1.0 | Basic | 3.0 | Single | 2.0 | 0 | 3 | 0 | 0.0 | Executive | 17678.0 |
| 4301 | 204301 | 0 | 41.0 | Self Enquiry | 3 | 9.0 | Small Business | Female | 3 | 4.0 | Deluxe | 4.0 | Married | 2.0 | 0 | 1 | 0 | 1.0 | Manager | 24393.0 |
# Check for duplicate rows and check on shape
print(f"There are {raw_df.duplicated().sum()} duplicate data entries.")
# (rows, columns) of the raw data
raw_df.shape
There are 0 duplicate data entries.
(4888, 20)
# Number of unique values in each numerical feature
for col in num_cols:
    print(f"The number of unique values in {col} are: {raw_df[col].nunique()}")
The number of unique values in CustomerID are: 4888 The number of unique values in ProdTaken are: 2 The number of unique values in Age are: 44 The number of unique values in CityTier are: 3 The number of unique values in DurationOfPitch are: 34 The number of unique values in NumberOfPersonVisiting are: 5 The number of unique values in NumberOfFollowups are: 6 The number of unique values in PreferredPropertyStar are: 3 The number of unique values in NumberOfTrips are: 12 The number of unique values in Passport are: 2 The number of unique values in PitchSatisfactionScore are: 5 The number of unique values in OwnCar are: 2 The number of unique values in NumberOfChildrenVisiting are: 4 The number of unique values in MonthlyIncome are: 2475
# Number of unique values in each categorical feature
for col in cat_cols:
    print(f"The number of unique values in {col} are: {raw_df[col].nunique()}")
The number of unique values in TypeofContact are: 2 The number of unique values in Occupation are: 4 The number of unique values in Gender are: 3 The number of unique values in ProductPitched are: 5 The number of unique values in MaritalStatus are: 4 The number of unique values in Designation are: 5
"""Solve Nans within the dataset and organize it for more thorough EDA"""
# Work on a fresh copy for the cleaning steps and release the raw frame
df_clean = raw_df.copy()
del raw_df
# Re-list only the features that still carry Nans
nan_counts = df_clean.isna().sum()
nan_counts[nan_counts > 0]
Age 226 TypeofContact 25 DurationOfPitch 251 NumberOfFollowups 45 PreferredPropertyStar 26 NumberOfTrips 140 NumberOfChildrenVisiting 66 MonthlyIncome 233 dtype: int64
# Heatmap of missing-value locations to look for joint-missingness patterns
sns.heatmap(df_clean.isna())
<AxesSubplot:>
There are no strong patterns of the Nan values within the data. Therefore each feature will likely need to be assessed individually.
"""Percent of Nans for each feature are all low"""
# Nan counts expressed as a percentage of the total number of rows
df_clean.isna().sum()[df_clean.isna().sum() > 0]/df_clean.shape[0] * 100
Age 4.623568 TypeofContact 0.511457 DurationOfPitch 5.135025 NumberOfFollowups 0.920622 PreferredPropertyStar 0.531915 NumberOfTrips 2.864157 NumberOfChildrenVisiting 1.350245 MonthlyIncome 4.766776 dtype: float64
Interpolation is unlikely to greatly influence any of the features since none have a proportion of Nans significantly over 5% of the data entries. Will need to check if there are correlations that could be used to interpolate based on relationships to other related features or if median and mode are the best routes.
TypeofContact is the only categorical variable, all the others are numerical that have Nans.
"""Heatmap of Pearson Correlations for only numerical features with Nans"""
nan_cols = df_clean.isna().sum()[df_clean.isna().sum() > 0].index.tolist()
nan_cols.remove('TypeofContact')
sns.heatmap(df_clean[nan_cols].corr(), vmin=-1, vmax=1, cmap='coolwarm', annot=True)
<AxesSubplot:>
There are no major linear relationships present within the data. The highest is a 0.46 relationship between Age and MonthlyIncome, which could be worth assessing more carefully.
# Pairwise scatter of the Nan-bearing numerical features to check for nonlinear relationships
sns.pairplot(df_clean[nan_cols])
<seaborn.axisgrid.PairGrid at 0x19d6f5e6cc8>
# Regression fit for the strongest correlation pair found above (Age vs MonthlyIncome)
sns.regplot(data=df_clean, x = "Age", y = "MonthlyIncome")
<AxesSubplot:xlabel='Age', ylabel='MonthlyIncome'>
There are no major correlations in the numerical columns with Nans. There is a very slight one between Age and Monthly Income, but nothing likely to help dramatically with interpolating Nans.
# Replace all Nans in the numerical columns with that column's median.
# Direct reassignment is used instead of fillna(..., inplace=True) on a
# column selection: the inplace form relies on chained assignment, which
# is unreliable (SettingWithCopyWarning) and deprecated in modern pandas.
for col in nan_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())
# TypeofContact is categorical, so fill its Nans with the most frequent level
mode_contact = df_clean.TypeofContact.value_counts().index[0]
df_clean['TypeofContact'] = df_clean['TypeofContact'].fillna(mode_contact)
# Re-check the missing-value heatmap: it should be blank now that all Nans are filled
sns.heatmap(df_clean.isna())
<AxesSubplot:>
# Descriptive statistics for the numerical features after Nan filling
df_clean.describe(include=np.number)
| CustomerID | ProdTaken | Age | CityTier | DurationOfPitch | NumberOfPersonVisiting | NumberOfFollowups | PreferredPropertyStar | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 |
| mean | 202443.500000 | 0.188216 | 37.547259 | 1.654255 | 15.362930 | 2.905074 | 3.711129 | 3.577946 | 3.229746 | 0.290917 | 3.078151 | 0.620295 | 1.184738 | 23559.179419 |
| std | 1411.188388 | 0.390925 | 9.104795 | 0.916583 | 8.316166 | 0.724891 | 0.998271 | 0.797005 | 1.822769 | 0.454232 | 1.365792 | 0.485363 | 0.852323 | 5257.862921 |
| min | 200000.000000 | 0.000000 | 18.000000 | 1.000000 | 5.000000 | 1.000000 | 1.000000 | 3.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1000.000000 |
| 25% | 201221.750000 | 0.000000 | 31.000000 | 1.000000 | 9.000000 | 2.000000 | 3.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 | 0.000000 | 1.000000 | 20485.000000 |
| 50% | 202443.500000 | 0.000000 | 36.000000 | 1.000000 | 13.000000 | 3.000000 | 4.000000 | 3.000000 | 3.000000 | 0.000000 | 3.000000 | 1.000000 | 1.000000 | 22347.000000 |
| 75% | 203665.250000 | 0.000000 | 43.000000 | 3.000000 | 19.000000 | 3.000000 | 4.000000 | 4.000000 | 4.000000 | 1.000000 | 4.000000 | 1.000000 | 2.000000 | 25424.750000 |
| max | 204887.000000 | 1.000000 | 61.000000 | 3.000000 | 127.000000 | 5.000000 | 6.000000 | 5.000000 | 22.000000 | 1.000000 | 5.000000 | 1.000000 | 3.000000 | 98678.000000 |
def hist_boxplot(df, feature, figsize=(12, 7), kde=False, bins="auto"):
    """
    Boxplot and histogram stacked on a shared x-axis for easy comparison
    of a single variable's distribution.

    df: dataframe containing the data
    feature: column of interest
    figsize: size of the figure
    kde: whether to overlay a kde on the histogram
    bins: number of bins for the histogram ("auto" lets seaborn choose,
          matching the previous behavior; the old docstring advertised
          this parameter but the signature lacked it)
    """
    fig, (ax_box2, ax_hist2) = plt.subplots(nrows=2, sharex=True,
                                            gridspec_kw={'height_ratios': (0.25, 0.75)},
                                            figsize=figsize)
    sns.boxplot(data=df, x=feature, ax=ax_box2, showmeans=True)
    sns.histplot(data=df, x=feature, kde=kde, ax=ax_hist2, bins=bins)
    # Mark the median (solid black) and mean (dashed green) on the histogram
    ax_hist2.axvline(df[feature].median(), color='black', linestyle='-')
    ax_hist2.axvline(df[feature].mean(), color='green', linestyle='--')
"""Closer examination of numerical features with boxplot and histograms"""
num_cols = df_clean.select_dtypes(include=np.number).columns.tolist()
# Iterate the columns directly instead of by index (same plots, clearer loop)
for col in num_cols:
    hist_boxplot(df_clean, col)
# Examine the Duration of Pitch outlier entries flagged by the boxplot above
df_clean[df_clean.DurationOfPitch > 40]
| CustomerID | ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1434 | 201434 | 0 | 36.0 | Company Invited | 3 | 126.0 | Salaried | Male | 2 | 3.0 | Basic | 3.0 | Married | 3.0 | 0 | 1 | 1 | 1.0 | Executive | 18482.0 |
| 3878 | 203878 | 0 | 53.0 | Company Invited | 3 | 127.0 | Salaried | Male | 3 | 4.0 | Basic | 3.0 | Married | 4.0 | 0 | 1 | 1 | 2.0 | Executive | 22160.0 |
# Examine the Number of Trips outlier entries flagged by the boxplot above
df_clean[df_clean.NumberOfTrips > 15]
| CustomerID | ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 385 | 200385 | 1 | 30.0 | Company Invited | 1 | 10.0 | Large Business | Male | 2 | 3.0 | Basic | 3.0 | Single | 19.0 | 1 | 4 | 1 | 1.0 | Executive | 17285.0 |
| 816 | 200816 | 0 | 39.0 | Company Invited | 1 | 15.0 | Salaried | Male | 3 | 3.0 | Deluxe | 4.0 | Unmarried | 21.0 | 0 | 2 | 1 | 0.0 | Manager | 21782.0 |
| 2829 | 202829 | 1 | 31.0 | Company Invited | 1 | 11.0 | Large Business | Male | 3 | 4.0 | Basic | 3.0 | Single | 20.0 | 1 | 4 | 1 | 2.0 | Executive | 20963.0 |
| 3260 | 203260 | 0 | 40.0 | Company Invited | 1 | 16.0 | Salaried | Male | 4 | 4.0 | Deluxe | 4.0 | Unmarried | 22.0 | 0 | 2 | 1 | 1.0 | Manager | 25460.0 |
# Examine the Monthly Income low-end outlier entries
df_clean[df_clean.MonthlyIncome < 10000]
| CustomerID | ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 142 | 200142 | 0 | 38.0 | Self Enquiry | 1 | 9.0 | Large Business | Female | 2 | 3.0 | Deluxe | 3.0 | Single | 4.0 | 1 | 5 | 0 | 0.0 | Manager | 1000.0 |
| 2586 | 202586 | 0 | 39.0 | Self Enquiry | 1 | 10.0 | Large Business | Female | 3 | 4.0 | Deluxe | 3.0 | Single | 5.0 | 1 | 5 | 0 | 1.0 | Manager | 4678.0 |
# Examine the Monthly Income high-end outlier entries
df_clean[df_clean.MonthlyIncome > 80000]
| CustomerID | ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 38 | 200038 | 0 | 36.0 | Self Enquiry | 1 | 11.0 | Salaried | Female | 2 | 4.0 | Basic | 3.0 | Divorced | 1.0 | 1 | 2 | 1 | 0.0 | Executive | 95000.0 |
| 2482 | 202482 | 0 | 37.0 | Self Enquiry | 1 | 12.0 | Salaried | Female | 3 | 5.0 | Basic | 5.0 | Divorced | 2.0 | 1 | 2 | 1 | 1.0 | Executive | 98678.0 |
"""Solve outliers found in numerical data"""
# .loc assignment is used throughout instead of the original chained
# indexing (df.col[mask] = value), which raises SettingWithCopyWarning
# and is not guaranteed to write back to the dataframe in modern pandas.
# DurationOfPitch: cap the two extreme pitches (>40) at the upper quartile
Q3 = df_clean.DurationOfPitch.quantile(q=0.75)
df_clean.loc[df_clean.DurationOfPitch > 40, 'DurationOfPitch'] = Q3
# NumberOfTrips: cap the extreme trip counts (>15) at the upper quartile
Q3 = df_clean.NumberOfTrips.quantile(q=0.75)
df_clean.loc[df_clean.NumberOfTrips > 15, 'NumberOfTrips'] = Q3
# MonthlyIncome: replace the low outlier (1000) with the median and the
# high outliers (>80000) with the upper quartile
Q2 = df_clean.MonthlyIncome.median()
Q3 = df_clean.MonthlyIncome.quantile(q=0.75)
df_clean.loc[df_clean.MonthlyIncome == 1000, 'MonthlyIncome'] = Q2
df_clean.loc[df_clean.MonthlyIncome > 80000, 'MonthlyIncome'] = Q3
"""Re-examination of numerical features with boxplot and histograms"""
# Iterate the columns directly instead of by index (same plots, clearer loop)
for col in num_cols:
    hist_boxplot(df_clean, col)
# Descriptive statistics after outlier treatment, for comparison with the earlier table
df_clean.describe(include=np.number)
| CustomerID | ProdTaken | Age | CityTier | DurationOfPitch | NumberOfPersonVisiting | NumberOfFollowups | PreferredPropertyStar | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 |
| mean | 202443.500000 | 0.188216 | 37.547259 | 1.654255 | 15.318944 | 2.905074 | 3.711129 | 3.577946 | 3.216244 | 0.290917 | 3.078151 | 0.620295 | 1.184738 | 23534.326412 |
| std | 1411.188388 | 0.390925 | 9.104795 | 0.916583 | 8.006696 | 0.724891 | 0.998271 | 0.797005 | 1.754321 | 0.454232 | 1.365792 | 0.485363 | 0.852323 | 5034.190039 |
| min | 200000.000000 | 0.000000 | 18.000000 | 1.000000 | 5.000000 | 1.000000 | 1.000000 | 3.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 4678.000000 |
| 25% | 201221.750000 | 0.000000 | 31.000000 | 1.000000 | 9.000000 | 2.000000 | 3.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 | 0.000000 | 1.000000 | 20485.750000 |
| 50% | 202443.500000 | 0.000000 | 36.000000 | 1.000000 | 13.000000 | 3.000000 | 4.000000 | 3.000000 | 3.000000 | 0.000000 | 3.000000 | 1.000000 | 1.000000 | 22347.000000 |
| 75% | 203665.250000 | 0.000000 | 43.000000 | 3.000000 | 19.000000 | 3.000000 | 4.000000 | 4.000000 | 4.000000 | 1.000000 | 4.000000 | 1.000000 | 2.000000 | 25422.687500 |
| max | 204887.000000 | 1.000000 | 61.000000 | 3.000000 | 36.000000 | 5.000000 | 6.000000 | 5.000000 | 8.000000 | 1.000000 | 5.000000 | 1.000000 | 3.000000 | 38677.000000 |
"""Closer examination of the categorical data for errors"""
cat_cols = df_clean.select_dtypes(exclude=np.number).columns.tolist()
# Print each categorical feature's distinct levels to spot data-entry errors
for col in cat_cols:
    print(col)
    print(pd.Series(df_clean[col].unique(), name=col))
    print('*' * 50)
TypeofContact 0 Self Enquiry 1 Company Invited Name: TypeofContact, dtype: object ************************************************** Occupation 0 Salaried 1 Free Lancer 2 Small Business 3 Large Business Name: Occupation, dtype: object ************************************************** Gender 0 Female 1 Male 2 Fe Male Name: Gender, dtype: object ************************************************** ProductPitched 0 Deluxe 1 Basic 2 Standard 3 Super Deluxe 4 King Name: ProductPitched, dtype: object ************************************************** MaritalStatus 0 Single 1 Divorced 2 Married 3 Unmarried Name: MaritalStatus, dtype: object ************************************************** Designation 0 Manager 1 Executive 2 Senior Manager 3 AVP 4 VP Name: Designation, dtype: object **************************************************
# Counts, unique levels, modes, and mode frequencies for the categorical features
df_clean.describe(exclude=np.number)
| TypeofContact | Occupation | Gender | ProductPitched | MaritalStatus | Designation | |
|---|---|---|---|---|---|---|
| count | 4888 | 4888 | 4888 | 4888 | 4888 | 4888 |
| unique | 2 | 4 | 3 | 5 | 4 | 5 |
| top | Self Enquiry | Salaried | Male | Basic | Married | Executive |
| freq | 3469 | 2368 | 2916 | 1842 | 2340 | 1842 |
# Normalize the mistyped "Fe Male" level into "Female". Direct reassignment
# avoids replace(..., inplace=True) on a column selection, which relies on
# chained assignment and is deprecated/unreliable in modern pandas.
df_clean['Gender'] = df_clean['Gender'].replace({"Fe Male":"Female"})
df_clean.Gender.unique()
array(['Female', 'Male'], dtype=object)
# Confirm every feature now shows a full non-null count
df_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4888 entries, 0 to 4887 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 4888 non-null int64 1 ProdTaken 4888 non-null int64 2 Age 4888 non-null float64 3 TypeofContact 4888 non-null object 4 CityTier 4888 non-null int64 5 DurationOfPitch 4888 non-null float64 6 Occupation 4888 non-null object 7 Gender 4888 non-null object 8 NumberOfPersonVisiting 4888 non-null int64 9 NumberOfFollowups 4888 non-null float64 10 ProductPitched 4888 non-null object 11 PreferredPropertyStar 4888 non-null float64 12 MaritalStatus 4888 non-null object 13 NumberOfTrips 4888 non-null float64 14 Passport 4888 non-null int64 15 PitchSatisfactionScore 4888 non-null int64 16 OwnCar 4888 non-null int64 17 NumberOfChildrenVisiting 4888 non-null float64 18 Designation 4888 non-null object 19 MonthlyIncome 4888 non-null float64 dtypes: float64(7), int64(7), object(6) memory usage: 763.9+ KB
# CustomerID uniquely identifies rows, so promote it to the index
df_clean.set_index('CustomerID', inplace=True)
# Discrete count features were read in as floats only because of the Nans;
# cast them back to integers now that every Nan has been filled
integers = ['Age', 'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips', 'NumberOfChildrenVisiting']
df_clean[integers] = df_clean[integers].astype('int64')
# Convert the text features to the memory-efficient category dtype
categories = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation']
df_clean[categories] = df_clean[categories].astype('category')
df_clean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4888 entries, 200000 to 204887 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ProdTaken 4888 non-null int64 1 Age 4888 non-null int64 2 TypeofContact 4888 non-null category 3 CityTier 4888 non-null int64 4 DurationOfPitch 4888 non-null float64 5 Occupation 4888 non-null category 6 Gender 4888 non-null category 7 NumberOfPersonVisiting 4888 non-null int64 8 NumberOfFollowups 4888 non-null int64 9 ProductPitched 4888 non-null category 10 PreferredPropertyStar 4888 non-null int64 11 MaritalStatus 4888 non-null category 12 NumberOfTrips 4888 non-null int64 13 Passport 4888 non-null int64 14 PitchSatisfactionScore 4888 non-null int64 15 OwnCar 4888 non-null int64 16 NumberOfChildrenVisiting 4888 non-null int64 17 Designation 4888 non-null category 18 MonthlyIncome 4888 non-null float64 dtypes: category(6), float64(2), int64(11) memory usage: 564.3 KB
# Spot-check rows after re-indexing and dtype conversion
df_clean.sample(7)
| ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CustomerID | |||||||||||||||||||
| 200017 | 0 | 22 | Company Invited | 1 | 22.0 | Small Business | Male | 3 | 2 | Basic | 3 | Married | 2 | 1 | 3 | 0 | 0 | Executive | 17935.0 |
| 200189 | 0 | 51 | Self Enquiry | 1 | 9.0 | Small Business | Female | 3 | 3 | Super Deluxe | 4 | Single | 4 | 0 | 5 | 0 | 1 | AVP | 28734.0 |
| 201437 | 0 | 39 | Company Invited | 1 | 15.0 | Salaried | Male | 2 | 3 | King | 3 | Single | 3 | 0 | 4 | 0 | 1 | VP | 34431.0 |
| 201025 | 0 | 36 | Self Enquiry | 3 | 10.0 | Small Business | Female | 2 | 3 | Deluxe | 3 | Divorced | 2 | 1 | 5 | 1 | 0 | Manager | 22347.0 |
| 202132 | 1 | 20 | Self Enquiry | 3 | 8.0 | Small Business | Female | 2 | 4 | Basic | 3 | Single | 2 | 0 | 4 | 1 | 0 | Executive | 17044.0 |
| 202703 | 1 | 32 | Company Invited | 1 | 36.0 | Small Business | Male | 4 | 5 | Basic | 4 | Unmarried | 2 | 0 | 3 | 1 | 3 | Executive | 22157.0 |
| 200451 | 0 | 42 | Self Enquiry | 1 | 30.0 | Small Business | Male | 2 | 3 | Standard | 5 | Divorced | 2 | 1 | 2 | 1 | 1 | Senior Manager | 22406.0 |
"""New Dataframe with all cleaning for data exploration"""
# Snapshot the cleaned data under a new name and release the old reference
df_eda = df_clean.copy()
del df_clean
# Table of general descriptive statistics for all features
df_eda.describe(include='all')
| ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4888.000000 | 4888.000000 | 4888 | 4888.000000 | 4888.000000 | 4888 | 4888 | 4888.000000 | 4888.000000 | 4888 | 4888.000000 | 4888 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888.000000 | 4888 | 4888.000000 |
| unique | NaN | NaN | 2 | NaN | NaN | 4 | 2 | NaN | NaN | 5 | NaN | 4 | NaN | NaN | NaN | NaN | NaN | 5 | NaN |
| top | NaN | NaN | Self Enquiry | NaN | NaN | Salaried | Male | NaN | NaN | Basic | NaN | Married | NaN | NaN | NaN | NaN | NaN | Executive | NaN |
| freq | NaN | NaN | 3469 | NaN | NaN | 2368 | 2916 | NaN | NaN | 1842 | NaN | 2340 | NaN | NaN | NaN | NaN | NaN | 1842 | NaN |
| mean | 0.188216 | 37.547259 | NaN | 1.654255 | 15.318944 | NaN | NaN | 2.905074 | 3.711129 | NaN | 3.577946 | NaN | 3.216244 | 0.290917 | 3.078151 | 0.620295 | 1.184738 | NaN | 23534.326412 |
| std | 0.390925 | 9.104795 | NaN | 0.916583 | 8.006696 | NaN | NaN | 0.724891 | 0.998271 | NaN | 0.797005 | NaN | 1.754321 | 0.454232 | 1.365792 | 0.485363 | 0.852323 | NaN | 5034.190039 |
| min | 0.000000 | 18.000000 | NaN | 1.000000 | 5.000000 | NaN | NaN | 1.000000 | 1.000000 | NaN | 3.000000 | NaN | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | NaN | 4678.000000 |
| 25% | 0.000000 | 31.000000 | NaN | 1.000000 | 9.000000 | NaN | NaN | 2.000000 | 3.000000 | NaN | 3.000000 | NaN | 2.000000 | 0.000000 | 2.000000 | 0.000000 | 1.000000 | NaN | 20485.750000 |
| 50% | 0.000000 | 36.000000 | NaN | 1.000000 | 13.000000 | NaN | NaN | 3.000000 | 4.000000 | NaN | 3.000000 | NaN | 3.000000 | 0.000000 | 3.000000 | 1.000000 | 1.000000 | NaN | 22347.000000 |
| 75% | 0.000000 | 43.000000 | NaN | 3.000000 | 19.000000 | NaN | NaN | 3.000000 | 4.000000 | NaN | 4.000000 | NaN | 4.000000 | 1.000000 | 4.000000 | 1.000000 | 2.000000 | NaN | 25422.687500 |
| max | 1.000000 | 61.000000 | NaN | 3.000000 | 36.000000 | NaN | NaN | 5.000000 | 6.000000 | NaN | 5.000000 | NaN | 8.000000 | 1.000000 | 5.000000 | 1.000000 | 3.000000 | NaN | 38677.000000 |
# Descriptive statistics for only the customers who purchased a package
df_eda[df_eda['ProdTaken'] == 1].describe(include='all')
| ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 920.0 | 920.000000 | 920 | 920.000000 | 920.000000 | 920 | 920 | 920.000000 | 920.000000 | 920 | 920.000000 | 920 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920 | 920.000000 |
| unique | NaN | NaN | 2 | NaN | NaN | 4 | 2 | NaN | NaN | 5 | NaN | 4 | NaN | NaN | NaN | NaN | NaN | 5 | NaN |
| top | NaN | NaN | Self Enquiry | NaN | NaN | Salaried | Male | NaN | NaN | Basic | NaN | Married | NaN | NaN | NaN | NaN | NaN | Executive | NaN |
| freq | NaN | NaN | 610 | NaN | NaN | 414 | 578 | NaN | NaN | 552 | NaN | 326 | NaN | NaN | NaN | NaN | NaN | 552 | NaN |
| mean | 1.0 | 34.829348 | NaN | 1.819565 | 16.683696 | NaN | NaN | 2.919565 | 3.942391 | NaN | 3.741304 | NaN | 3.270652 | 0.536957 | 3.223913 | 0.608696 | 1.198913 | NaN | 22178.885870 |
| std | 0.0 | 9.828021 | NaN | 0.958354 | 8.305049 | NaN | NaN | 0.702899 | 1.002688 | NaN | 0.857934 | NaN | 1.890244 | 0.498904 | 1.327648 | 0.488308 | 0.861837 | NaN | 4567.349943 |
| min | 1.0 | 18.000000 | NaN | 1.000000 | 6.000000 | NaN | NaN | 2.000000 | 1.000000 | NaN | 3.000000 | NaN | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | NaN | 16009.000000 |
| 25% | 1.0 | 28.000000 | NaN | 1.000000 | 10.000000 | NaN | NaN | 2.000000 | 3.000000 | NaN | 3.000000 | NaN | 2.000000 | 0.000000 | 2.000000 | 0.000000 | 1.000000 | NaN | 18475.250000 |
| 50% | 1.0 | 33.000000 | NaN | 1.000000 | 15.000000 | NaN | NaN | 3.000000 | 4.000000 | NaN | 3.000000 | NaN | 3.000000 | 1.000000 | 3.000000 | 1.000000 | 1.000000 | NaN | 21274.000000 |
| 75% | 1.0 | 41.000000 | NaN | 3.000000 | 23.000000 | NaN | NaN | 3.000000 | 5.000000 | NaN | 5.000000 | NaN | 4.000000 | 1.000000 | 4.000000 | 1.000000 | 2.000000 | NaN | 23857.500000 |
| max | 1.0 | 60.000000 | NaN | 3.000000 | 36.000000 | NaN | NaN | 4.000000 | 6.000000 | NaN | 5.000000 | NaN | 8.000000 | 1.000000 | 5.000000 | 1.000000 | 3.000000 | NaN | 38537.000000 |
# Descriptive statistics for only the customers who did not purchase
df_eda[df_eda['ProdTaken'] == 0].describe(include="all")
| ProdTaken | Age | TypeofContact | CityTier | DurationOfPitch | Occupation | Gender | NumberOfPersonVisiting | NumberOfFollowups | ProductPitched | PreferredPropertyStar | MaritalStatus | NumberOfTrips | Passport | PitchSatisfactionScore | OwnCar | NumberOfChildrenVisiting | Designation | MonthlyIncome | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3968.0 | 3968.000000 | 3968 | 3968.000000 | 3968.000000 | 3968 | 3968 | 3968.000000 | 3968.000000 | 3968 | 3968.000000 | 3968 | 3968.000000 | 3968.000000 | 3968.000000 | 3968.000000 | 3968.000000 | 3968 | 3968.000000 |
| unique | NaN | NaN | 2 | NaN | NaN | 3 | 2 | NaN | NaN | 5 | NaN | 4 | NaN | NaN | NaN | NaN | NaN | 5 | NaN |
| top | NaN | NaN | Self Enquiry | NaN | NaN | Salaried | Male | NaN | NaN | Deluxe | NaN | Married | NaN | NaN | NaN | NaN | NaN | Manager | NaN |
| freq | NaN | NaN | 2859 | NaN | NaN | 1954 | 2338 | NaN | NaN | 1528 | NaN | 2014 | NaN | NaN | NaN | NaN | NaN | 1528 | NaN |
| mean | 0.0 | 38.177419 | NaN | 1.615927 | 15.002520 | NaN | NaN | 2.901714 | 3.657510 | NaN | 3.540071 | NaN | 3.203629 | 0.233871 | 3.044355 | 0.622984 | 1.181452 | NaN | 23848.591860 |
| std | 0.0 | 8.811116 | NaN | 0.902434 | 7.903406 | NaN | NaN | 0.729940 | 0.989682 | NaN | 0.777427 | NaN | 1.721282 | 0.423344 | 1.372439 | 0.484700 | 0.850178 | NaN | 5085.566743 |
| min | 0.0 | 18.000000 | NaN | 1.000000 | 5.000000 | NaN | NaN | 1.000000 | 1.000000 | NaN | 3.000000 | NaN | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | NaN | 4678.000000 |
| 25% | 0.0 | 32.000000 | NaN | 1.000000 | 9.000000 | NaN | NaN | 2.000000 | 3.000000 | NaN | 3.000000 | NaN | 2.000000 | 0.000000 | 2.000000 | 0.000000 | 1.000000 | NaN | 20761.750000 |
| 50% | 0.0 | 36.000000 | NaN | 1.000000 | 13.000000 | NaN | NaN | 3.000000 | 4.000000 | NaN | 3.000000 | NaN | 3.000000 | 0.000000 | 3.000000 | 1.000000 | 1.000000 | NaN | 22413.500000 |
| 75% | 0.0 | 44.000000 | NaN | 3.000000 | 18.000000 | NaN | NaN | 3.000000 | 4.000000 | NaN | 4.000000 | NaN | 4.000000 | 0.000000 | 4.000000 | 1.000000 | 2.000000 | NaN | 25741.250000 |
| max | 0.0 | 61.000000 | NaN | 3.000000 | 36.000000 | NaN | NaN | 5.000000 | 6.000000 | NaN | 5.000000 | NaN | 8.000000 | 1.000000 | 5.000000 | 1.000000 | 3.000000 | NaN | 38677.000000 |
# Column listing for reference while grouping features below
df_eda.columns
Index(['ProdTaken', 'Age', 'TypeofContact', 'CityTier', 'DurationOfPitch',
'Occupation', 'Gender', 'NumberOfPersonVisiting', 'NumberOfFollowups',
'ProductPitched', 'PreferredPropertyStar', 'MaritalStatus',
'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'OwnCar',
'NumberOfChildrenVisiting', 'Designation', 'MonthlyIncome'],
dtype='object')
All of these features are potentially relevant, but thinking about them along these lines will assist in exploring the data for potential trends, in part because it will limit the figures viewed at any one time.
def barplot_annot(dataframe, feature, annot='percent',
                  n=None, h=0, w=0, s=12):
    """
    Barplot with percentage annotation of counts on top.

    dataframe: dataframe containing relevant data
    feature: column in dataframe to be plotted
    annot: whether to annotate plots with 'percent', 'count', or both
        (any other value annotates with both count and percent)
    n: displays the top n category levels (default is None, i.e., display all levels)
    h: vertical position of annotation above bar (fraction of bar height)
    w: lateral position of annotation above bar (fraction of bar width)
    s: fontsize for annotations
    """
    rows = len(dataframe[feature])  # denominator for percentages (includes NaN rows)
    count = dataframe[feature].nunique()
    # Scale figure width to the number of category levels displayed
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        # BUG FIX: original was `plt.figure(figsize(n + 1, 5))`, which called
        # `figsize` as a (nonexistent) function and raised NameError.
        plt.figure(figsize=(n + 1, 5))
    ax = sns.countplot(data=dataframe, x=feature,
                       order=dataframe[feature].value_counts().index[:n].sort_values())
    for p in ax.patches:
        if annot == 'percent':
            label = "{:.1f}%".format(100 * p.get_height()/rows)
        elif annot == 'count':
            label = p.get_height()
        else:  # both count and percent
            perc = "{:.1f}%".format(100 * p.get_height()/rows)
            num = p.get_height()
            label = "{} ({})".format(num, perc)
        # Offsets are expressed as fractions of the bar's own width/height
        x = p.get_x() + p.get_width() / 2 - p.get_width() * w
        y = p.get_y() + p.get_height() + p.get_height() * h
        ax.annotate(label, (x, y), size = s)
    plt.tight_layout()
    plt.title(feature, fontsize=25)
    sns.despine()
    plt.show()
"""How often do customer actually purchase a product?"""
barplot_annot(df_eda, 'ProdTaken',h=0.01, w=0.5, s=13, annot='perc')
plt.savefig("Product Purchased")
<Figure size 432x288 with 0 Axes>
Customers only take a product 18.8% of the time. Out of 4888 customers only 920 purchased a travel package.
# Presentation-quality version of the ProdTaken count plot, saved to disk.
plt.figure(figsize=(6, 6))
ax = sns.countplot(data=df_eda, x='ProdTaken')
plt.title("Customers who Purchased Travel Plans", fontsize=16)
plt.xlabel("Product Purchased", fontsize=14)
plt.ylabel("Number of Customers", fontsize=14)
plt.xticks(ticks=[0, 1], labels=["No", "Yes"])
plt.tight_layout()
sns.despine()
plt.savefig("PurchasedPlan", dpi=300)
# Create column lists of features of customer and features of sale for easier comparisons
# Include the target in both!
# Features describing the customer themselves (demographics / assets).
customers = [
"Age",
"Gender",
"Occupation",
"NumberOfPersonVisiting",
"PreferredPropertyStar",
"MaritalStatus",
"NumberOfTrips",
"Passport",
"OwnCar",
"NumberOfChildrenVisiting",
"Designation",
"MonthlyIncome",
"ProdTaken"
]
# Features describing the sales interaction itself.
sales = [
"TypeofContact",
"CityTier",
"DurationOfPitch",
"ProductPitched",
"NumberOfFollowups",
"PitchSatisfactionScore",
"ProdTaken"
]
# Pairwise relationships across ALL features, colored by the target; low alpha
# keeps the dense scatter readable.
sns.pairplot(df_eda, hue='ProdTaken', kind='reg', plot_kws={'scatter_kws':{'alpha':0.05}})
<seaborn.axisgrid.PairGrid at 0x19d068b12c8>
"""How might features of the customer be related to them purchasing a product?"""
sns.pairplot(df_eda[customers], hue='ProdTaken', kind='reg', plot_kws={'scatter_kws':{'alpha':0.05}})
<seaborn.axisgrid.PairGrid at 0x19d72349c88>
No dramatic clustering appears in any of the trends with regards to customers who did or did not purchase a product based on numerical features. Interestingly, you can see on the univariate distributions that the customers who took the package tend to look essentially the same, just at a lower frequency, which makes sense as most customers do not take a package. There are some potential differences to explore, demonstrated by the trend lines in these plots:
You can observe other trends, unrelated to whether a customer purchased or not. Unsurprisingly, Number of Children Visiting and Number of Persons Visiting appear to have a positive correlation, as children are still people and so are counted in both. There is also a positive trend for Age and Monthly Income, but the relationship is not actually linear — it is only a lower bound on income by Age. Essentially, if you are older, the minimum income is likely to be higher, but there appears to be no upper bound on monthly income related to age.
"""How might feature of the sale be related to a customer purchasing a product?"""
sns.pairplot(df_eda[sales], hue='ProdTaken', kind='reg', plot_kws={'scatter_kws':{'alpha':0.05}})
<seaborn.axisgrid.PairGrid at 0x19d7c8d2a88>
We observe the same effect with regards to sales features as we saw with customer features: there are no obvious shifts in the likelihood of a customer purchasing a travel package based on any single numerical feature of the sale. There is an interesting overall trend, though, in the position of the trend lines: on all sales metrics, the trend line for customers who bought plans sits slightly above the line for those who did not, suggesting that slight increases in these metrics may accompany a slight increase in sales. The difference is small but consistent.
# Correlation heatmap of the numeric features (corr() silently skips
# non-numeric columns).
plt.figure(figsize=(16, 12))
sns.heatmap(df_eda.corr(), vmin=-1, vmax=1, cmap="coolwarm", annot=True)
<AxesSubplot:>
count = df_eda["Occupation"].nunique()
sorter = df_eda["ProdTaken"].value_counts().index[-1]
tab1 = pd.crosstab(df_eda["Occupation"], df_eda["ProdTaken"], margins=True).sort_values(by=sorter, ascending=False)
print(tab1)
print("*"*120)
tab = pd.crosstab(df_eda["Occupation"], df_eda["ProdTaken"], normalize="index").sort_values(by=sorter, ascending=False)
tab.plot(kind="bar", stacked=True, figsize=(count + 5, 8))
plt.title("Precent of customers who purchase a plan", fontsize=16)
plt.xlabel("Occupation", fontsize=14)
plt.ylabel("Precent of Customers", fontsize=14)
plt.xticks(rotation=0)
plt.legend(loc="lower left", frameon=False)
plt.legend(loc="upper left", bbox_to_anchor=(1,1))
# plt.show()
plt.savefig("PercentCustomersPurchasePlanByOccupatoin", dpi=300)
ProdTaken 0 1 All Occupation All 3968 920 4888 Salaried 1954 414 2368 Small Business 1700 384 2084 Large Business 314 120 434 Free Lancer 0 2 2 ************************************************************************************************************************
# Raw counts of customers per occupation, saved to disk.
plt.figure(figsize=(6, 6))
sns.countplot(data=df_eda, x="Occupation")
plt.title("Customer Occupations", fontsize=16)
# BUG FIX: xlabel previously said "Product Purchased" (copy-paste from the
# ProdTaken plot), but the x-axis here is Occupation.
plt.xlabel("Occupation", fontsize=14)
plt.ylabel("Number of Customers", fontsize=14)
plt.tight_layout()
sns.despine()
plt.savefig("OccupationCounts", dpi=300)
"""Examination of distributions of categorical data"""
for i in range(len(cat_cols)):
barplot_annot(df_eda, cat_cols[i], h=0.01, w=0.5, s=13, annot='both')
def stacked_barplot(df, predictor, target):
    """
    Print the category counts and plot a stacked bar plot.

    df: dataframe
    predictor: independent (categorical) variable
    target: target (dependent) variable
    """
    count = df[predictor].nunique()
    # Sort categories by the least frequent target class (the positive class here)
    sorter = df[target].value_counts().index[-1]
    tab1 = pd.crosstab(df[predictor], df[target], margins=True).sort_values(by=sorter, ascending=False)
    print(tab1)
    print("*"*120)
    tab = pd.crosstab(df[predictor], df[target], normalize="index").sort_values(by=sorter, ascending=False)
    # Figure width scales with the number of category levels
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 5))
    # BUG FIX: a first plt.legend(loc="lower left", frameon=False) call was
    # dead code — the second call replaced it; only the effective one is kept.
    plt.legend(loc="upper left", bbox_to_anchor=(1,1))
    plt.show()
# Stacked purchase-rate plot for every categorical column.
for col in cat_cols:
    stacked_barplot(df_eda, col, 'ProdTaken')
ProdTaken 0 1 All TypeofContact All 3968 920 4888 Self Enquiry 2859 610 3469 Company Invited 1109 310 1419 ************************************************************************************************************************
ProdTaken 0 1 All Occupation All 3968 920 4888 Salaried 1954 414 2368 Small Business 1700 384 2084 Large Business 314 120 434 Free Lancer 0 2 2 ************************************************************************************************************************
ProdTaken 0 1 All Gender All 3968 920 4888 Male 2338 578 2916 Female 1630 342 1972 ************************************************************************************************************************
ProdTaken 0 1 All ProductPitched All 3968 920 4888 Basic 1290 552 1842 Deluxe 1528 204 1732 Standard 618 124 742 King 210 20 230 Super Deluxe 322 20 342 ************************************************************************************************************************
ProdTaken 0 1 All MaritalStatus All 3968 920 4888 Married 2014 326 2340 Single 612 304 916 Unmarried 516 166 682 Divorced 826 124 950 ************************************************************************************************************************
ProdTaken 0 1 All Designation All 3968 920 4888 Executive 1290 552 1842 Manager 1528 204 1732 Senior Manager 618 124 742 AVP 322 20 342 VP 210 20 230 ************************************************************************************************************************
"""Does the product pitched influence a customers purchase?"""
plt.figure(figsize=(17, 75))
for i in range(len(num_cols)):
plt.subplot(18, 3, i + 1)
sns.violinplot(data=df_eda, x='ProductPitched', y=num_cols[i], hue="ProdTaken", split=True)
plt.tight_layout()
plt.title(num_cols[i], fontsize=25)
sns.despine()
"""Does the Marital Status effect a customers purchase?"""
plt.figure(figsize=(17, 75))
for i in range(len(num_cols)):
plt.subplot(18, 3, i + 1)
sns.violinplot(data=df_eda, x='MaritalStatus', y=num_cols[i], hue="ProdTaken", split=True)
plt.tight_layout()
plt.title(num_cols[i], fontsize=25)
sns.despine()
"""Does the ownership of a passport influence a customers purchase?"""
plt.figure(figsize=(17, 75))
for i in range(len(num_cols)):
plt.subplot(18, 3, i + 1)
sns.violinplot(data=df_eda, x='Passport', y=num_cols[i], hue="ProdTaken", split=True)
plt.tight_layout()
plt.title(num_cols[i], fontsize=25)
sns.despine()
"""Does the designation of a customers job influence their purchase?"""
plt.figure(figsize=(17, 75))
for i in range(len(num_cols)):
plt.subplot(18, 3, i + 1)
sns.violinplot(data=df_eda, x='Designation', y=num_cols[i], hue="ProdTaken", split=True)
plt.tight_layout()
plt.title(num_cols[i], fontsize=25)
sns.despine()
"""Does the occupation influence a customers purchase?"""
plt.figure(figsize=(17, 75))
for i in range(len(num_cols)):
plt.subplot(18, 3, i + 1)
b sns.violinplot(data=df_eda, x='Occupation', y=num_cols[i], hue="ProdTaken", split=True)
plt.tight_layout()
plt.title(num_cols[i], fontsize=25)
sns.despine()
"""Does the whether the company contacted a customers influence their purchase?"""
plt.figure(figsize=(17, 75))
for i in range(len(num_cols)):
plt.subplot(18, 3, i + 1)
sns.violinplot(data=df_eda, x='TypeofContact', y=num_cols[i], hue="ProdTaken", split=True)
plt.tight_layout()
plt.title(num_cols[i], fontsize=25)
sns.despine()
# Assess each numeric feature's relationship to the target variable (ProdTaken)
# via a logistic-fit regplot. (The original comment said "Personal Loan",
# apparently copied from another project.)
# NOTE(review): num_cols is defined here but used by earlier cells — the
# notebook was presumably executed out of order; confirm cell ordering.
num_cols = df_eda.select_dtypes(include=np.number).columns.tolist()
num_cols.remove("ProdTaken")  # keep predictors only
plt.figure(figsize=(17, 75))
for pos, col in enumerate(num_cols, start=1):
    plt.subplot(18, 3, pos)
    sns.regplot(data=df_eda, x=col, y="ProdTaken", logistic=True, ci=None)
    plt.tight_layout()
    plt.title(col, fontsize=25)
    sns.despine()
plt.show()
The trends in the data with regards to logistic regression fits are either absent or weak across the variables. Decision tree models will therefore be used for this analysis.
Recall is the primary target metric for these models. This will optimize for ensuring a customer likely to purchase a plan is pursued, as opposed to missing out on some of those potential customers in favor of being more confident each customer we contact will purchase a travel plan.
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance

    model: classifier (already fitted)
    predictors: independent variables
    target: dependent variable
    Returns a one-row dataframe with Accuracy, Recall, Precision, and F1.
    """
    # Predict once, then score against the supplied labels
    pred = model.predict(predictors)
    scores = {
        "Accuracy": metrics.accuracy_score(target, pred),
        "Recall": metrics.recall_score(target, pred),
        "Precision": metrics.precision_score(target, pred),
        "F1": metrics.f1_score(target, pred),
    }
    return pd.DataFrame(scores, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    To plot the confusion_matrix with percentages

    model: classifier (already fitted)
    predictors: independent variables
    target: dependent variable
    """
    cm = metrics.confusion_matrix(target, model.predict(predictors))
    total = cm.flatten().sum()
    # Annotate each cell with its raw count plus its share of all predictions
    labels = np.asarray(
        ["{0:0.0f}".format(v) + "\n{0:.2%}".format(v / total) for v in cm.flatten()]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
df_model = df_eda.copy() #create new dataframe for modeling
"""Prepare dataset for modeling and split into testing and training"""
df_model = pd.get_dummies(df_model, drop_first=True) # One-hot encode all categorical variables
X = df_model.drop('ProdTaken', axis=1) # Separate data from target variable
Y = df_model.ProdTaken # Extract target variable
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42) # Split dataset for training and testing
# initialize dataframes for training and test performance data
model_train_perf = pd.DataFrame(index=["Accuracy", "Recall", "Precision", "F1"])
model_test_perf = pd.DataFrame(index=["Accuracy", "Recall", "Precision", "F1"])
"""Initialize and run basic decision tree model model"""
# Class weights roughly mirror the ~19%/81% class imbalance of ProdTaken
dtree = tree.DecisionTreeClassifier(criterion="gini", class_weight={0:0.19, 1:0.81}, random_state=42)
dtree.fit(x_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.19, 1: 0.81}, random_state=42)
# Test-set confusion matrix, then record training performance for the unpruned tree
confusion_matrix_sklearn(dtree, x_test, y_test)
model_train_perf['Decision Tree'] = model_performance_classification_sklearn(dtree, x_train, y_train).T
model_performance_classification_sklearn(dtree, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Record test-set performance for the unpruned tree
model_test_perf['Decision Tree'] = model_performance_classification_sklearn(dtree, x_test, y_test).T
model_performance_classification_sklearn(dtree, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.887526 | 0.711679 | 0.69395 | 0.702703 |
"""Run Decision tree model agian, but with pre-pruning"""
est = tree.DecisionTreeClassifier(criterion='gini', class_weight={0:0.19, 1:0.81}, random_state=42) # Choose classifier type for grid
# Set the grid parameters to test
params = {
"max_depth": np.arange(2, 8),
"min_samples_leaf": np.arange(1, 6),
"max_leaf_nodes":np.arange(8, 16)
}
# Score to compare params on
acc_scorer = metrics.make_scorer(metrics.recall_score)
#Run grid search
grid = GridSearchCV(est, params, scoring=acc_scorer, cv=5)
grid = grid.fit(x_train, y_train)
est = grid.best_estimator_ # Set clf to best combination of params
est.fit(x_train, y_train) # Fit best algorithm
DecisionTreeClassifier(class_weight={0: 0.19, 1: 0.81}, max_depth=6,
max_leaf_nodes=13, random_state=42)
# NOTE(review): this confusion matrix uses the TRAINING set, unlike the other
# models which use x_test — confirm that is intended.
confusion_matrix_sklearn(est, x_train, y_train)
model_train_perf["Decision Tree Pre-pruned"] = model_performance_classification_sklearn(est, x_train, y_train).T
model_performance_classification_sklearn(est, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.749196 | 0.741486 | 0.409402 | 0.527533 |
model_test_perf["Decision Tree Pre-pruned"] = model_performance_classification_sklearn(est, x_test, y_test).T
model_performance_classification_sklearn(est, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.725971 | 0.689781 | 0.373518 | 0.484615 |
"""Prune tree based on performance in Recall, post-pruning"""
clf = tree.DecisionTreeClassifier(criterion='gini', class_weight={0:0.19, 1:0.81}, random_state=42) # Initialize new tree model
path = clf.cost_complexity_pruning_path(x_train, y_train) # Create pruning path based on training dataset
ccp_alphas, impurities = path.ccp_alphas, path.impurities # Extract alphas and impurites
# Run decision trees with each alpha
clfs = []
for i in ccp_alphas:
clf = tree.DecisionTreeClassifier(criterion='gini', class_weight={0:0.19, 1:0.81}, random_state=42, ccp_alpha = i)
clf.fit(x_train, y_train)
clfs.append(clf)
print(f"Number of nodes in the last tree is {clfs[-1].tree_.node_count} with a ccp_alpha of {ccp_alphas[-1]}")
train_recalls = []
for i in clfs:
pred_train = i.predict(x_train)
values_train = metrics.recall_score(y_train, pred_train)
train_recalls.append(values_train)
test_recalls = []
for i in clfs:
pred_test = i.predict(x_test)
values_test = metrics.recall_score(y_test, pred_test)
test_recalls.append(values_test)
Number of nodes in the last tree is 1 with a ccp_alpha of 0.050821443917997144
# Plot train/test recall against pruning alpha, then shortlist alphas where
# the two are close (low overfitting).
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(ccp_alphas, train_recalls, marker='o', label='Train', drawstyle='steps-post')
ax.plot(ccp_alphas, test_recalls, marker='o', label='Test', drawstyle='steps-post')
ax.set_xlabel("alpha")
ax.set_ylabel("Recall")
ax.set_title("Recall vs alpha")
ax.legend()
plt.show()
alpha_diffs = np.subtract(train_recalls, test_recalls) # Per-alpha gap between train and test recall
diffs = pd.Series(alpha_diffs)
idx = diffs[diffs < 0.1].index # Keep alphas where train recall exceeds test recall by < 0.1 (low overfit)
idx
train_recalls = np.array(train_recalls)
test_recalls = np.array(test_recalls)
pd.DataFrame([train_recalls[idx], test_recalls[idx], ccp_alphas[idx]], index=["Train", "Test", "Alpha"], columns=idx) # Format results for comparison
| 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Train | 0.859133 | 0.859133 | 0.856037 | 0.851393 | 0.834365 | 0.834365 | 0.814241 | 0.814241 | 0.823529 | 0.840557 | 0.825077 | 0.825077 | 0.829721 | 0.835913 | 0.828173 | 0.801858 | 0.811146 | 0.744582 | 0.747678 | 0.727554 | 0.727554 | 0.716718 | 0.682663 | 0.696594 | 0.606811 | 0.606811 | 0.642415 | 0.693498 | 0.537152 | 0.537152 | 0.000000 |
| Test | 0.770073 | 0.770073 | 0.770073 | 0.770073 | 0.755474 | 0.755474 | 0.740876 | 0.740876 | 0.748175 | 0.777372 | 0.762774 | 0.766423 | 0.770073 | 0.784672 | 0.781022 | 0.766423 | 0.781022 | 0.704380 | 0.711679 | 0.689781 | 0.689781 | 0.678832 | 0.642336 | 0.642336 | 0.598540 | 0.598540 | 0.616788 | 0.671533 | 0.536496 | 0.536496 | 0.000000 |
| Alpha | 0.001862 | 0.001903 | 0.001944 | 0.001979 | 0.001981 | 0.002027 | 0.002115 | 0.002129 | 0.002222 | 0.002247 | 0.002480 | 0.002525 | 0.002662 | 0.002786 | 0.002877 | 0.002941 | 0.003017 | 0.003157 | 0.003418 | 0.003446 | 0.003965 | 0.004078 | 0.004583 | 0.004615 | 0.004993 | 0.006608 | 0.006807 | 0.010736 | 0.012438 | 0.019092 | 0.050821 |
# Index 249 chosen by inspecting the comparison table above (high test recall,
# small train/test gap) — NOTE(review): this hard-coded index must be revisited
# if the data or split changes.
opt_dtree = clfs[249]
opt_dtree
DecisionTreeClassifier(ccp_alpha=0.002785743395895813,
class_weight={0: 0.19, 1: 0.81}, random_state=42)
# Test-set confusion matrix, then record training performance for the post-pruned tree
confusion_matrix_sklearn(opt_dtree, x_test, y_test)
model_train_perf["Decision Tree Post-pruned"] = model_performance_classification_sklearn(opt_dtree, x_train, y_train).T
model_performance_classification_sklearn(opt_dtree, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.756212 | 0.835913 | 0.425868 | 0.564263 |
model_test_perf["Decision Tree Post-pruned"] = model_performance_classification_sklearn(opt_dtree, x_test, y_test).T
model_performance_classification_sklearn(opt_dtree, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.734151 | 0.784672 | 0.393773 | 0.52439 |
"""Examine the important features for this model"""
feature_names = x_train.columns
importances = opt_dtree.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
# Bagging ensemble with the default (unweighted) base estimator
baggins = BaggingClassifier(random_state=42)
baggins.fit(x_train, y_train)
BaggingClassifier(random_state=42)
confusion_matrix_sklearn(baggins, x_test, y_test)
model_train_perf["Bagging"] = model_performance_classification_sklearn(baggins, x_train, y_train).T
model_performance_classification_sklearn(baggins, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.992985 | 0.962848 | 1.0 | 0.981073 |
model_test_perf["Bagging"] = model_performance_classification_sklearn(baggins, x_test, y_test).T
model_performance_classification_sklearn(baggins, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.903885 | 0.587591 | 0.851852 | 0.695464 |
# Bagging over class-weighted trees.
# BUG FIX: the original only seeded the base estimator, leaving the ensemble's
# own bootstrap sampling unseeded — results varied between runs. Seeding the
# BaggingClassifier itself (random_state=42, matching the file's convention)
# makes the run reproducible.
baggins_wt = BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(criterion='gini', class_weight={0:0.19, 1:0.81}, random_state=42), random_state=42)
baggins_wt.fit(x_train, y_train)
BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight={0: 0.19,
1: 0.81},
random_state=42))
# Test-set confusion matrix, then record training performance for weighted bagging
confusion_matrix_sklearn(baggins_wt, x_test, y_test)
model_train_perf["Bagging Weighted"] = model_performance_classification_sklearn(baggins_wt, x_train, y_train).T
model_performance_classification_sklearn(baggins_wt, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.991815 | 0.958204 | 0.998387 | 0.977883 |
model_test_perf["Bagging Weighted"] = model_performance_classification_sklearn(baggins_wt, x_test, y_test).T
model_performance_classification_sklearn(baggins_wt, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.909339 | 0.59854 | 0.877005 | 0.711497 |
# Initial Parameter search
# Coarse grid over ensemble size and feature subsampling, optimizing recall
tree_class = tree.DecisionTreeClassifier(criterion='gini', class_weight={0:0.19, 1:0.81}, random_state=42)
params = {
'base_estimator':[tree_class],
'n_estimators': [5, 10, 15, 20, 25, 50, 100, 150],
'max_features': [0.5, 0.6, 0.7, 0.8, 0.9, 1]
}
grid = GridSearchCV(BaggingClassifier(random_state=42, bootstrap=True), param_grid=params, scoring='recall', cv=5)
grid.fit(x_train, y_train)
grid.best_estimator_
BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight={0: 0.19,
1: 0.81},
random_state=42),
max_features=0.9, n_estimators=15, random_state=42)
# Fine tune parameter search for n_estimators
# Narrow search around the n_estimators=15 found by the coarse grid above
tree_class = tree.DecisionTreeClassifier(criterion='gini', class_weight={0:0.19, 1:0.81}, random_state=42)
params = {
'base_estimator':[tree_class],
'n_estimators': np.arange(10, 21),
'max_features': [0.5, 0.6, 0.7, 0.8, 0.9, 1]
}
grid = GridSearchCV(BaggingClassifier(random_state=42, bootstrap=True), param_grid=params, scoring='recall', cv=5)
grid.fit(x_train, y_train)
grid.best_estimator_
BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight={0: 0.19,
1: 0.81},
random_state=42),
max_features=0.9, n_estimators=13, random_state=42)
# Implement best estimated model
# Refit the best bagging configuration, then record training performance
baggins_est = grid.best_estimator_
baggins_est.fit(x_train, y_train)
BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight={0: 0.19,
1: 0.81},
random_state=42),
max_features=0.9, n_estimators=13, random_state=42)
confusion_matrix_sklearn(baggins_est, x_test, y_test)
model_train_perf["Bagging Tuned"] = model_performance_classification_sklearn(baggins_est, x_train, y_train).T
model_performance_classification_sklearn(baggins_est, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.995908 | 0.979876 | 0.998423 | 0.989062 |
model_test_perf["Bagging Tuned"] = model_performance_classification_sklearn(baggins_est, x_test, y_test).T
model_performance_classification_sklearn(baggins_est, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.919564 | 0.656934 | 0.882353 | 0.753138 |
# Random forest with default parameters (no class weighting)
ents = RandomForestClassifier(random_state=42)
ents.fit(x_train, y_train)
RandomForestClassifier(random_state=42)
confusion_matrix_sklearn(ents, x_test, y_test)
model_train_perf["Random Forest"] = model_performance_classification_sklearn(ents, x_train, y_train).T
model_performance_classification_sklearn(ents, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
model_test_perf["Random Forest"] = model_performance_classification_sklearn(ents, x_test, y_test).T
model_performance_classification_sklearn(ents, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.912065 | 0.587591 | 0.909605 | 0.713969 |
# Random forest with class weighting matched to the target imbalance
ents_wt = RandomForestClassifier(class_weight={0:0.19, 1:0.81}, random_state=42)
ents_wt.fit(x_train, y_train)
RandomForestClassifier(class_weight={0: 0.19, 1: 0.81}, random_state=42)
confusion_matrix_sklearn(ents_wt, x_test, y_test)
model_train_perf["Random Forest Weighted"] = model_performance_classification_sklearn(ents_wt, x_train, y_train).T
model_performance_classification_sklearn(ents_wt, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
model_test_perf["Random Forest Weighted"] = model_performance_classification_sklearn(ents_wt, x_test, y_test).T
model_performance_classification_sklearn(ents_wt, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.907975 | 0.551095 | 0.92638 | 0.691076 |
# Coarse grid search over forest size/sampling parameters, optimizing recall.
# beep() signals completion of the long-running search.
ent_est = RandomForestClassifier(class_weight={0:0.19, 1:0.81}, random_state=42)
params = {
"n_estimators": [5, 10, 15, 20, 25, 50, 100, 150],
"min_samples_leaf": np.arange(1, 5),
"max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
"max_samples": [0.5, 0.6, 0.7, 0.8, 0.9, 1]
}
grid = GridSearchCV(ent_est, params, scoring='recall', cv=5)
grid = grid.fit(x_train, y_train)
beep(sound=1)
RandomForestClassifier(class_weight={0: 0.19, 1: 0.81}, max_features=0.9,
max_samples=0.9, min_samples_leaf=4, n_estimators=15,
random_state=42)
grid.best_estimator_
# Fine grid search narrowing n_estimators around the coarse result (15)
ent_est = RandomForestClassifier(class_weight={0:0.19, 1:0.81}, random_state=42)
params = {
"n_estimators": np.arange(10, 21),
"min_samples_leaf": np.arange(1, 5),
"max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
"max_samples": [0.5, 0.6, 0.7, 0.8, 0.9, 1]
}
grid = GridSearchCV(ent_est, params, scoring='recall', cv=5)
grid = grid.fit(x_train, y_train)
beep(sound=1)
RandomForestClassifier(class_weight={0: 0.19, 1: 0.81}, max_features=0.9,
max_samples=0.9, min_samples_leaf=4, n_estimators=13,
random_state=42)
grid.best_estimator_
# Implement best estimated model
# Refit the best random forest, then record training performance
ent_est = grid.best_estimator_
ent_est.fit(x_train, y_train)
RandomForestClassifier(class_weight={0: 0.19, 1: 0.81}, max_features=0.9,
max_samples=0.9, min_samples_leaf=4, n_estimators=13,
random_state=42)
confusion_matrix_sklearn(ent_est, x_test,y_test)
model_train_perf["Random Forest Tuned"] = model_performance_classification_sklearn(ent_est, x_train, y_train).T
model_performance_classification_sklearn(ent_est, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.961707 | 0.944272 | 0.865248 | 0.903035 |
model_test_perf["Random Forest Tuned"] = model_performance_classification_sklearn(ent_est, x_test, y_test).T
model_performance_classification_sklearn(ent_est, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.881391 | 0.653285 | 0.693798 | 0.672932 |
"""Examine the important features for this model"""
feature_names = x_train.columns
importances = ent_est.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
# AdaBoost with default parameters
ab_class = AdaBoostClassifier(random_state=42)
ab_class.fit(x_train, y_train)
AdaBoostClassifier(random_state=42)
confusion_matrix_sklearn(ab_class, x_test,y_test)
model_train_perf["AdaBoost Classifier"] = model_performance_classification_sklearn(ab_class, x_train, y_train).T
model_performance_classification_sklearn(ab_class, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.854721 | 0.365325 | 0.73065 | 0.4871 |
model_test_perf["AdaBoost Classifier"] = model_performance_classification_sklearn(ab_class, x_test, y_test).T
model_performance_classification_sklearn(ab_class, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.838446 | 0.310219 | 0.639098 | 0.41769 |
# Grid search over AdaBoost ensemble size and learning rate, optimizing recall
ab_tuned = AdaBoostClassifier(random_state=42)
params = {
"n_estimators": np.arange(10, 21),
"learning_rate": [1, 0.5, 0.1, 0.01]
}
grid = GridSearchCV(ab_tuned, params, scoring='recall', cv=5)
grid = grid.fit(x_train, y_train)
beep(sound=1)
AdaBoostClassifier(learning_rate=1, n_estimators=13, random_state=42)
grid.best_estimator_
# Refit the best configuration, then record training performance
ab_tuned = grid.best_estimator_
ab_tuned.fit(x_train, y_train)
AdaBoostClassifier(learning_rate=1, n_estimators=13, random_state=42)
confusion_matrix_sklearn(ab_tuned, x_test,y_test)
model_train_perf["AdaBoost Tuned"] = model_performance_classification_sklearn(ab_tuned, x_train, y_train).T
model_performance_classification_sklearn(ab_tuned, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.840398 | 0.26161 | 0.710084 | 0.382353 |
model_test_perf["AdaBoost Tuned"] = model_performance_classification_sklearn(ab_tuned, x_test, y_test).T
model_performance_classification_sklearn(ab_tuned, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.826176 | 0.244526 | 0.582609 | 0.344473 |
"""Examine the important features for this model"""
feature_names = x_train.columns
importances = ab_tuned.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
# Gradient boosting with default parameters
gb_est = GradientBoostingClassifier(random_state=42)
gb_est.fit(x_train, y_train)
GradientBoostingClassifier(random_state=42)
confusion_matrix_sklearn(gb_est, x_test,y_test)
model_train_perf["Gradient Boost"] = model_performance_classification_sklearn(gb_est, x_train, y_train).T
model_performance_classification_sklearn(gb_est, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.888337 | 0.47678 | 0.875 | 0.617234 |
model_test_perf["Gradient Boost"] = model_performance_classification_sklearn(gb_est, x_test, y_test).T
model_performance_classification_sklearn(gb_est, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.861622 | 0.368613 | 0.770992 | 0.498765 |
# Coarse grid search over gradient-boosting size/sampling/depth, optimizing recall
gb_tuned = GradientBoostingClassifier(random_state=42)
params = {
"n_estimators": np.arange(10, 21),
"subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
"max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
"max_depth": np.arange(2, 12, 2)
}
grid = GridSearchCV(gb_tuned, params, scoring='recall', cv=5)
grid = grid.fit(x_train, y_train)
beep(sound=7)
GradientBoostingClassifier(max_depth=10, max_features=0.9, n_estimators=20,
random_state=42, subsample=1)
grid.best_estimator_
# Fine grid search around the coarse best (larger n_estimators, depth 8-11)
gb_tuned = GradientBoostingClassifier(random_state=42)
params = {
"n_estimators": [20, 25, 50, 100, 150, 500],
"subsample": [0.9, 1],
"max_features": [0.8, 0.9, 1],
"max_depth": np.arange(8, 12, 1)
}
grid = GridSearchCV(gb_tuned, params, scoring='recall', cv=5)
grid = grid.fit(x_train, y_train)
beep(sound=7)
grid.best_estimator_
GradientBoostingClassifier(max_depth=9, max_features=0.8, random_state=42,
subsample=1)
# Refit the best gradient-boosting configuration, then record training performance
gb_tuned = grid.best_estimator_
gb_tuned.fit(x_train, y_train)
GradientBoostingClassifier(max_depth=9, max_features=0.8, random_state=42,
subsample=1)
confusion_matrix_sklearn(gb_tuned, x_test,y_test)
model_train_perf["Gradient Boost Tuned"] = model_performance_classification_sklearn(gb_tuned, x_train, y_train).T
model_performance_classification_sklearn(gb_tuned, x_train, y_train)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
model_test_perf["Gradient Boost Tuned"] = model_performance_classification_sklearn(gb_tuned, x_test, y_test).T
model_performance_classification_sklearn(gb_tuned, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.931152 | 0.70438 | 0.906103 | 0.792608 |
"""Examine the important features for this model"""
feature_names = x_train.columns
importances = gb_tuned.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
# Baseline XGBoost with default hyper-parameters; eval_metric='logloss'
# silences the warning about the default evaluation metric. fit() returns
# the estimator itself, so fitting can be chained onto construction.
xgb_est = XGBClassifier(random_state=42, eval_metric='logloss').fit(x_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=4,
num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
# Confusion matrix on held-out data, then record baseline XGBoost training metrics.
confusion_matrix_sklearn(xgb_est, x_test, y_test)
xgb_train_scores = model_performance_classification_sklearn(xgb_est, x_train, y_train)
model_train_perf["XGBoost"] = xgb_train_scores.T
xgb_train_scores
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.999708 | 0.998452 | 1.0 | 0.999225 |
model_test_perf["XGBoost"] = model_performance_classification_sklearn(xgb_est, x_test, y_test).T
model_performance_classification_sklearn(xgb_est, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.93456 | 0.722628 | 0.908257 | 0.804878 |
"""Coarse hyper-parameter search for XGBoost, scored on recall."""
xgb_tuned = XGBClassifier(random_state=42, eval_metric='logloss')
params = {
    "n_estimators": np.arange(10, 21),
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "gamma": [0, 1, 3, 5],
    "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "colsample_bylevel": [0.5, 0.6, 0.7, 0.8, 0.9, 1],
}
# n_jobs=-1 runs the CV fits in parallel; the chosen parameters are unchanged.
grid = GridSearchCV(xgb_tuned, params, scoring='recall', cv=5, n_jobs=-1)
grid = grid.fit(x_train, y_train)
beep(sound=7)  # audible alert when the (long) search finishes
grid.best_estimator_
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=20, n_jobs=4,
num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
"""Refined XGBoost search around the coarse search's best region.

n_estimators is extended upward because the coarse best (20) sat at the
top of its searched range.
"""
xgb_tuned = XGBClassifier(random_state=42, eval_metric='logloss')
params = {
    "n_estimators": [20, 25, 50, 100, 150, 500],
    "subsample": [0.8, 0.9, 1],
    "gamma": [0, 1, 2],
    "colsample_bytree": [0.8, 0.9, 1],
    "colsample_bylevel": [0.8, 0.9, 1],
}
# n_jobs=-1 parallelizes the CV fits; results are identical to the serial run.
grid = GridSearchCV(xgb_tuned, params, scoring='recall', cv=5, n_jobs=-1)
grid = grid.fit(x_train, y_train)
beep(sound=7)  # audible alert when the search finishes
grid.best_estimator_
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.8,
colsample_bynode=1, colsample_bytree=0.9, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=500, n_jobs=4,
num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
# Adopt the winning estimator from the refined XGBoost grid search.
xgb_tuned = grid.best_estimator_
# NOTE(review): with GridSearchCV's default refit=True the best estimator is
# already trained on x_train/y_train — this refit is redundant (harmless).
xgb_tuned.fit(x_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.8,
colsample_bynode=1, colsample_bytree=0.9, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=500, n_jobs=4,
num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
# Confusion matrix on held-out data, then record the tuned XGBoost training metrics.
confusion_matrix_sklearn(xgb_tuned, x_test, y_test)
xgb_tuned_train_scores = model_performance_classification_sklearn(xgb_tuned, x_train, y_train)
model_train_perf["XGBoost Tuned"] = xgb_tuned_train_scores.T
xgb_tuned_train_scores
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
model_test_perf["XGBoost Tuned"] = model_performance_classification_sklearn(xgb_tuned, x_test, y_test).T
model_performance_classification_sklearn(xgb_tuned, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.932515 | 0.733577 | 0.885463 | 0.802395 |
"""Examine the important features for this model"""
feature_names = x_train.columns
importances = xgb_tuned.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
Examine the models built up to this point to decide which ones to include in the stacking ensemble.
# Training-set metrics for every model fitted so far, one column per model.
model_train_perf
| Decision Tree | Decision Tree Pre-pruned | Decision Tree Post-pruned | Bagging | Bagging Weighted | Bagging Tuned | Random Forest | Random Forest Weighted | Random Forest Tuned | AdaBoost Classifier | AdaBoost Tuned | Gradient Boost | Gradient Boost Tuned | XGBoost | XGBoost Tuned | Stacking | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 1.0 | 0.749196 | 0.756212 | 0.992985 | 0.991815 | 0.995908 | 1.0 | 1.0 | 0.961707 | 0.854721 | 0.840398 | 0.888337 | 1.0 | 0.999708 | 1.0 | 0.942999 |
| Recall | 1.0 | 0.741486 | 0.835913 | 0.962848 | 0.958204 | 0.979876 | 1.0 | 1.0 | 0.944272 | 0.365325 | 0.261610 | 0.476780 | 1.0 | 0.998452 | 1.0 | 0.808050 |
| Precision | 1.0 | 0.409402 | 0.425868 | 1.000000 | 0.998387 | 0.998423 | 1.0 | 1.0 | 0.865248 | 0.730650 | 0.710084 | 0.875000 | 1.0 | 1.000000 | 1.0 | 0.880270 |
| F1 | 1.0 | 0.527533 | 0.564263 | 0.981073 | 0.977883 | 0.989062 | 1.0 | 1.0 | 0.903035 | 0.487100 | 0.382353 | 0.617234 | 1.0 | 0.999225 | 1.0 | 0.842615 |
# Test-set metrics for every model fitted so far, one column per model.
model_test_perf
| Decision Tree | Decision Tree Pre-pruned | Decision Tree Post-pruned | Bagging | Bagging Weighted | Bagging Tuned | Random Forest | Random Forest Weighted | Random Forest Tuned | AdaBoost Classifier | AdaBoost Tuned | Gradient Boost | Gradient Boost Tuned | XGBoost | XGBoost Tuned | Stacking | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 0.887526 | 0.725971 | 0.734151 | 0.903885 | 0.909339 | 0.919564 | 0.912065 | 0.907975 | 0.881391 | 0.838446 | 0.826176 | 0.861622 | 0.931152 | 0.934560 | 0.932515 | 0.890934 |
| Recall | 0.711679 | 0.689781 | 0.784672 | 0.587591 | 0.598540 | 0.656934 | 0.587591 | 0.551095 | 0.653285 | 0.310219 | 0.244526 | 0.368613 | 0.704380 | 0.722628 | 0.733577 | 0.602190 |
| Precision | 0.693950 | 0.373518 | 0.393773 | 0.851852 | 0.877005 | 0.882353 | 0.909605 | 0.926380 | 0.693798 | 0.639098 | 0.582609 | 0.770992 | 0.906103 | 0.908257 | 0.885463 | 0.763889 |
| F1 | 0.702703 | 0.484615 | 0.524390 | 0.695464 | 0.711497 | 0.753138 | 0.713969 | 0.691076 | 0.672932 | 0.417690 | 0.344473 | 0.498765 | 0.792608 | 0.804878 | 0.802395 | 0.673469 |
For each model type, the model with the least overfitting that can serve as a base learner for stacking was selected:
These models will be included in the stacking model with the XGBoost as the final estimator.
# Stack the least-overfit base learners; an XGBoost meta-model blends their
# out-of-fold predictions (cv=5 generates those predictions).
estimators = [
    ('Decision Tree', opt_dtree),
    ('Random Forest', ent_est),
    ('Gradient Boosting', gb_est),
]
final_est = XGBClassifier(random_state=42, eval_metric='logloss')
stacking_est = StackingClassifier(
    estimators=estimators,
    final_estimator=final_est,
    cv=5,
)
stacking_est.fit(x_train, y_train)
StackingClassifier(cv=5,
estimators=[('Decision Tree',
DecisionTreeClassifier(ccp_alpha=0.002785743395895813,
class_weight={0: 0.19,
1: 0.81},
random_state=42)),
('Random Forest',
RandomForestClassifier(class_weight={0: 0.19,
1: 0.81},
max_features=0.9,
max_samples=0.9,
min_samples_leaf=4,
n_estimators=13,
random_state=42)),
('Gradient Boosting',
GradientBoostingClassif...
importance_type='gain',
interaction_constraints=None,
learning_rate=None,
max_delta_step=None,
max_depth=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None,
random_state=42,
reg_alpha=None,
reg_lambda=None,
scale_pos_weight=None,
subsample=None,
tree_method=None,
validate_parameters=None,
verbosity=None))
# Confusion matrix on held-out data, then record the stacking model's training metrics.
confusion_matrix_sklearn(stacking_est, x_test, y_test)
stack_train_scores = model_performance_classification_sklearn(stacking_est, x_train, y_train)
model_train_perf["Stacking"] = stack_train_scores.T
stack_train_scores
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.942999 | 0.80805 | 0.88027 | 0.842615 |
model_test_perf["Stacking"] = model_performance_classification_sklearn(stacking_est, x_test, y_test).T
model_performance_classification_sklearn(stacking_est, x_test, y_test)
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.890934 | 0.60219 | 0.763889 | 0.673469 |
# Final side-by-side view of training metrics, now including the stacking model.
model_train_perf
| Decision Tree | Decision Tree Pre-pruned | Decision Tree Post-pruned | Bagging | Bagging Weighted | Bagging Tuned | Random Forest | Random Forest Weighted | Random Forest Tuned | AdaBoost Classifier | AdaBoost Tuned | Gradient Boost | Gradient Boost Tuned | XGBoost | XGBoost Tuned | Stacking | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 1.0 | 0.749196 | 0.756212 | 0.992985 | 0.991815 | 0.995908 | 1.0 | 1.0 | 0.961707 | 0.854721 | 0.840398 | 0.888337 | 1.0 | 0.999708 | 1.0 | 0.942999 |
| Recall | 1.0 | 0.741486 | 0.835913 | 0.962848 | 0.958204 | 0.979876 | 1.0 | 1.0 | 0.944272 | 0.365325 | 0.261610 | 0.476780 | 1.0 | 0.998452 | 1.0 | 0.808050 |
| Precision | 1.0 | 0.409402 | 0.425868 | 1.000000 | 0.998387 | 0.998423 | 1.0 | 1.0 | 0.865248 | 0.730650 | 0.710084 | 0.875000 | 1.0 | 1.000000 | 1.0 | 0.880270 |
| F1 | 1.0 | 0.527533 | 0.564263 | 0.981073 | 0.977883 | 0.989062 | 1.0 | 1.0 | 0.903035 | 0.487100 | 0.382353 | 0.617234 | 1.0 | 0.999225 | 1.0 | 0.842615 |
# Final side-by-side view of test metrics, now including the stacking model.
model_test_perf
| Decision Tree | Decision Tree Pre-pruned | Decision Tree Post-pruned | Bagging | Bagging Weighted | Bagging Tuned | Random Forest | Random Forest Weighted | Random Forest Tuned | AdaBoost Classifier | AdaBoost Tuned | Gradient Boost | Gradient Boost Tuned | XGBoost | XGBoost Tuned | Stacking | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 0.887526 | 0.725971 | 0.734151 | 0.903885 | 0.909339 | 0.919564 | 0.912065 | 0.907975 | 0.881391 | 0.838446 | 0.826176 | 0.861622 | 0.931152 | 0.934560 | 0.932515 | 0.890934 |
| Recall | 0.711679 | 0.689781 | 0.784672 | 0.587591 | 0.598540 | 0.656934 | 0.587591 | 0.551095 | 0.653285 | 0.310219 | 0.244526 | 0.368613 | 0.704380 | 0.722628 | 0.733577 | 0.602190 |
| Precision | 0.693950 | 0.373518 | 0.393773 | 0.851852 | 0.877005 | 0.882353 | 0.909605 | 0.926380 | 0.693798 | 0.639098 | 0.582609 | 0.770992 | 0.906103 | 0.908257 | 0.885463 | 0.763889 |
| F1 | 0.702703 | 0.484615 | 0.524390 | 0.695464 | 0.711497 | 0.753138 | 0.713969 | 0.691076 | 0.672932 | 0.417690 | 0.344473 | 0.498765 | 0.792608 | 0.804878 | 0.802395 | 0.673469 |
Of all the models, the Decision Tree Post-pruned had the greatest test recall, at 78.5%. It shows only slight overfitting, with a training recall of 83.6% — a small gap compared to most of the other models. However, it suffers relative to the others in Accuracy (73.4%) and Precision (39.4%). In other words, the model is not very good at identifying customers who will not purchase a package, but it is better than most at identifying those who will.
# Dump the pruned decision tree's rules as text; show_weights prints the
# (class-weighted) sample mass reaching each node.
# NOTE(review): feature names come from X.columns here while the importance
# plots use x_train.columns — presumably the same columns in the same order;
# verify against where X and x_train are built.
print(tree.export_text(opt_dtree, feature_names=X.columns.to_list(), show_weights=True))
|--- Passport <= 0.50 | |--- MaritalStatus_Single <= 0.50 | | |--- PreferredPropertyStar <= 4.50 | | | |--- NumberOfFollowups <= 5.50 | | | | |--- DurationOfPitch <= 13.50 | | | | | |--- weights: [150.86, 36.45] class: 0 | | | | |--- DurationOfPitch > 13.50 | | | | | |--- ProductPitched_Standard <= 0.50 | | | | | | |--- Age <= 22.50 | | | | | | | |--- weights: [1.52, 4.86] class: 1 | | | | | | |--- Age > 22.50 | | | | | | | |--- Occupation_Large Business <= 0.50 | | | | | | | | |--- weights: [91.01, 22.68] class: 0 | | | | | | | |--- Occupation_Large Business > 0.50 | | | | | | | | |--- weights: [8.55, 9.72] class: 1 | | | | | |--- ProductPitched_Standard > 0.50 | | | | | | |--- weights: [19.57, 20.25] class: 1 | | | |--- NumberOfFollowups > 5.50 | | | | |--- weights: [7.60, 10.53] class: 1 | | |--- PreferredPropertyStar > 4.50 | | | |--- Designation_Executive <= 0.50 | | | | |--- OwnCar <= 0.50 | | | | | |--- DurationOfPitch <= 14.50 | | | | | | |--- weights: [9.31, 1.62] class: 0 | | | | | |--- DurationOfPitch > 14.50 | | | | | | |--- weights: [4.56, 16.20] class: 1 | | | | |--- OwnCar > 0.50 | | | | | |--- weights: [32.68, 7.29] class: 0 | | | |--- Designation_Executive > 0.50 | | | | |--- DurationOfPitch <= 7.50 | | | | | |--- weights: [4.56, 0.00] class: 0 | | | | |--- DurationOfPitch > 7.50 | | | | | |--- weights: [12.54, 30.78] class: 1 | |--- MaritalStatus_Single > 0.50 | | |--- Designation_Executive <= 0.50 | | | |--- MonthlyIncome <= 21126.50 | | | | |--- weights: [12.54, 13.77] class: 1 | | | |--- MonthlyIncome > 21126.50 | | | | |--- TypeofContact_Self Enquiry <= 0.50 | | | | | |--- NumberOfTrips <= 3.50 | | | | | | |--- weights: [5.32, 0.00] class: 0 | | | | | |--- NumberOfTrips > 3.50 | | | | | | |--- weights: [0.19, 4.05] class: 1 | | | | |--- TypeofContact_Self Enquiry > 0.50 | | | | | |--- weights: [16.34, 0.81] class: 0 | | |--- Designation_Executive > 0.50 | | | |--- Age <= 32.50 | | | | |--- weights: [14.06, 46.17] class: 1 | | | |--- Age > 
32.50 | | | | |--- Age <= 53.50 | | | | | |--- PreferredPropertyStar <= 4.50 | | | | | | |--- weights: [14.63, 4.86] class: 0 | | | | | |--- PreferredPropertyStar > 4.50 | | | | | | |--- weights: [1.52, 7.29] class: 1 | | | | |--- Age > 53.50 | | | | | |--- weights: [0.00, 4.86] class: 1 |--- Passport > 0.50 | |--- Designation_Executive <= 0.50 | | |--- CityTier <= 1.50 | | | |--- DurationOfPitch <= 34.00 | | | | |--- PitchSatisfactionScore <= 4.50 | | | | | |--- Age <= 54.50 | | | | | | |--- weights: [40.47, 3.24] class: 0 | | | | | |--- Age > 54.50 | | | | | | |--- weights: [3.80, 5.67] class: 1 | | | | |--- PitchSatisfactionScore > 4.50 | | | | | |--- NumberOfTrips <= 2.50 | | | | | | |--- weights: [2.47, 10.53] class: 1 | | | | | |--- NumberOfTrips > 2.50 | | | | | | |--- weights: [6.27, 1.62] class: 0 | | | |--- DurationOfPitch > 34.00 | | | | |--- weights: [0.57, 5.67] class: 1 | | |--- CityTier > 1.50 | | | |--- MaritalStatus_Married <= 0.50 | | | | |--- weights: [14.25, 51.84] class: 1 | | | |--- MaritalStatus_Married > 0.50 | | | | |--- DurationOfPitch <= 18.00 | | | | | |--- weights: [17.48, 7.29] class: 0 | | | | |--- DurationOfPitch > 18.00 | | | | | |--- weights: [3.04, 15.39] class: 1 | |--- Designation_Executive > 0.50 | | |--- weights: [31.54, 179.82] class: 1
"""Examine the important features for the best performing model"""
feature_names = x_train.columns
importances = opt_dtree.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()